Load packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(gghalves)
library(ggdist)
library(ggridges)
## 
## Attaching package: 'ggridges'
## 
## The following objects are masked from 'package:ggdist':
## 
##     scale_point_color_continuous, scale_point_color_discrete,
##     scale_point_colour_continuous, scale_point_colour_discrete,
##     scale_point_fill_continuous, scale_point_fill_discrete,
##     scale_point_size_continuous
library(ggh4x)
## 
## Attaching package: 'ggh4x'
## 
## The following object is masked from 'package:ggplot2':
## 
##     guide_axis_logticks
library(ggpp)
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
## 
## Attaching package: 'ggpp'
## 
## The following object is masked from 'package:ggh4x':
## 
##     stat_centroid
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(rmarkdown)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ordinal)
## 
## Attaching package: 'ordinal'
## 
## The following object is masked from 'package:dplyr':
## 
##     slice
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(entropy)
library(gmodels)
library(plotrix)
## 
## Attaching package: 'plotrix'
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## The following object is masked from 'package:scales':
## 
##     rescale
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
# Fix the RNG seed so any stochastic steps (e.g. point jitter in plots) are reproducible
set.seed(314)

# Record the R version used to produce this document, for reproducibility
R.version
##                _                           
## platform       aarch64-apple-darwin20      
## arch           aarch64                     
## os             darwin20                    
## system         aarch64, darwin20           
## status                                     
## major          4                           
## minor          3.1                         
## year           2023                        
## month          06                          
## day            16                          
## svn rev        84548                       
## language       R                           
## version.string R version 4.3.1 (2023-06-16)
## nickname       Beagle Scouts

Load all the data

# ---- Experiment 1 (words): demographics and raw ratings ----
sociolex_demographics_words <- read_csv("sociolex_data/processed/sociolex_demographics_all_filtered_words.csv") # participant demographics
## Rows: 1254 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Response_ID, participant_email_address, participant_gender_categor...
## dbl (12): Progress, duration, participant_age, participant_gender_scale, par...
## lgl  (1): participant_language_other
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sociolex_ratings_long_words <- read_csv("sociolex_data/processed/sociolex_ratings_long_words.csv") # raw word ratings along the dimensions of gender, location, political alignment and valence
## Rows: 523218 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Response_ID, dimension, item, value, language, version
## dbl (1): value1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sociolex_ratings_long_words_age <- read_csv("sociolex_data/processed/sociolex_ratings_long_words_age.csv") # raw word ratings along the dimension of age
## Rows: 1002448 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Response_ID, item, age_category, dimension
## dbl (3): n, age_sum, age_rating_aggregated
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ---- Experiment 2 (images): demographics and raw ratings ----
sociolex_demographics_images <- read_csv("sociolex_data/processed/sociolex_demographics_all_filtered_images.csv") # participant demographics for the image experiment
## Rows: 455 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): Response_ID, participant_email_address, participant_gender_category...
## dbl (7): Progress, duration, participant_age, participant_gender_scale, part...
## lgl (7): participant_gender_category_other, participant_language_other, timi...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sociolex_ratings_long_images <- read_csv("sociolex_data/processed/sociolex_ratings_long_images.csv") # raw picture ratings along the dimensions of gender, location, political alignment and valence
## Rows: 181901 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Response_ID, item, language, version, value, dimension
## dbl (1): value1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# ---- Coding / item metadata ----
extra_words <- read_delim("sociolex_data/coding/extra_words.csv", delim = ",") %>%
  pull(value) # character vector of control items excluded from the analyses below
## Rows: 113 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
words_coding <- read_delim("sociolex_data/coding/words_coding.csv", delim = ",")
## Rows: 3999 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): item, image, word_czech, word_english, grammatical_gender, animacy...
## dbl  (1): multiple
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sociolex_coding <- read_csv("sociolex_data/coding/sociolex_coding.csv") %>%
  # Normalise version labels: anything that is not "Words" becomes "Images",
  # with the original sub-version kept in version2 and appended to the item ID
  # (e.g. "item_colour") so image variants stay distinct
  mutate(version = str_to_sentence(version),
         version2 = ifelse(version != "Words", version, NA),
         version = ifelse(!is.na(version2), "Images", version),
         item = ifelse(version == "Images", paste0(item, "_", tolower(version2)), item)) %>%
  select(-multiple, -semantic_category)
## Rows: 4010 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (13): item, image, word_czech, word_english, semantic_category, grammati...
## dbl  (4): multiple, role_noun, animal_person, multiple_word
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Socio-demographic characteristics of our sample

Both data sets are mainly based on data from our university pool. Apart from the basic demographic details such as age or native language, we also asked participants to self-report on additional information, namely, to situate themselves along four dimensions: liberal-conservative, pessimistic-optimistic, rural-urban, masculine-feminine. This will give us more detail about a sample that comes from a population that is traditionally used but perhaps rather specific.

# Clean up the demographic data to make the plot easier to make: pivot the
# four self-report *_scale columns long (one row per participant x dimension)
demographics_plot_data_words <- sociolex_demographics_words %>%
  pivot_longer(ends_with("_scale"), names_to = "dimension", values_to = "rating") %>%
  mutate(dimension = str_remove(dimension, "participant_"),
         dimension = str_remove(dimension, "_scale"),
         rating = as.numeric(as.character(rating)),
         # keep the dimension name in dimension1; dimension itself becomes a
         # numeric index (used as the y position in the ridgeline plot)
         dimension1 = dimension,
         dimension = as.numeric(factor(dimension)),
         # NOTE(review): this compares against the literal response string
         # "nebinární|nonbinary" (a bilingual label containing a pipe), not a
         # regex alternation — confirm the raw data really contain this string
         participant_gender_category = ifelse(is.na(participant_gender_category_other), participant_gender_category,
                                              ifelse(participant_gender_category_other == "nebinární|nonbinary", "non-binary", participant_gender_category)),
         # collapse every category other than male/female into one label
         participant_gender_category = if_else(participant_gender_category == "male", "male", if_else(participant_gender_category == "female", "female", "non-binary/self-report")),
         experiment = "Experiment 1: Words")

# Per-gender participant counts, formatted as facet labels "female\n(n = ...)"
demographics_plot_data_words2 <- demographics_plot_data_words %>%
  distinct(participant_gender_category, Response_ID, experiment) %>%
  group_by(participant_gender_category, experiment) %>%
  count(name = "participant_gender_category1") %>%
  mutate(participant_gender_category1 = paste0(participant_gender_category,
                                               "\n(n = ",
                                               participant_gender_category1,
                                               ")"))

# Attach the facet labels back to the participant-level data
demographics_plot_data_words <- demographics_plot_data_words %>%
  left_join(demographics_plot_data_words2)
## Joining with `by = join_by(participant_gender_category, experiment)`
# Get the number of participants in each demographic question option, e.g. the
# number of very liberal participants; .drop = FALSE keeps zero-count rating
# levels so every scale point gets a count label in the plot
demographics_plot_data_words1 <- demographics_plot_data_words %>%
  mutate(rating = factor(as.character(rating))) %>%
  group_by(participant_gender_category1, dimension, dimension1, rating, experiment, .drop = FALSE) %>%
  count() %>%
  mutate(rating = as.numeric(as.character(rating))) %>% 
  filter(is.na(rating) == FALSE) %>%
  mutate(experiment = "Experiment 1: Words")
# Clean up the demographic data for the image experiment.
# NOTE(review): this and the next two steps duplicate the words pipeline above
# verbatim — consider factoring both into a single helper function.
demographics_plot_data_images <- sociolex_demographics_images %>%
  pivot_longer(ends_with("_scale"), names_to = "dimension", values_to = "rating") %>%
  mutate(dimension = str_remove(dimension, "participant_"),
         dimension = str_remove(dimension, "_scale"),
         rating = as.numeric(as.character(rating)),
         dimension1 = dimension,
         dimension = as.numeric(factor(dimension)),
         participant_gender_category = ifelse(is.na(participant_gender_category_other), participant_gender_category,
                                              ifelse(participant_gender_category_other == "nebinární|nonbinary", "non-binary", participant_gender_category)),
         participant_gender_category = if_else(participant_gender_category == "male", "male", if_else(participant_gender_category == "female", "female", "non-binary/self-report")),
         experiment = "Experiment 2: Images")

# Per-gender participant counts, formatted as facet labels "female\n(n = ...)"
demographics_plot_data_images2 <- demographics_plot_data_images %>%
  distinct(participant_gender_category, Response_ID, experiment) %>%
  group_by(participant_gender_category, experiment) %>%
  count(name = "participant_gender_category1") %>%
  mutate(participant_gender_category1 = paste0(participant_gender_category,
                                               "\n(n = ",
                                               participant_gender_category1,
                                               ")"))

# Attach the facet labels back to the participant-level data
demographics_plot_data_images <- demographics_plot_data_images %>%
  left_join(demographics_plot_data_images2)
## Joining with `by = join_by(participant_gender_category, experiment)`
# Get the number of participants in each demographic question option, e.g. the
# number of very liberal participants; .drop = FALSE keeps zero-count levels
demographics_plot_data_images1 <- demographics_plot_data_images %>%
  mutate(rating = factor(as.character(rating))) %>%
  group_by(participant_gender_category1, dimension, dimension1, rating, experiment, .drop = FALSE) %>%
  count() %>%
  mutate(rating = as.numeric(as.character(rating))) %>% 
  filter(is.na(rating) == FALSE) %>%
  mutate(experiment = "Experiment 2: Images")
# Stack both experiments for the shared demographics plot
demographics_plot_data <- demographics_plot_data_words %>%
  bind_rows(demographics_plot_data_images)

demographics_plot_data1 <- demographics_plot_data_words1 %>%
  bind_rows(demographics_plot_data_images1)
# Ridgeline plot of the four self-report scales, faceted by experiment (outer
# strip) and gender category (inner strip, via ggh4x::facet_nested_wrap)
demographics_plot <- demographics_plot_data %>%
  ggplot(aes(x = rating, y = dimension, group = dimension, colour = dimension1, fill = dimension1)) +
  # quantiles = 2 draws the median line on each density ridge
  stat_density_ridges(quantile_lines = TRUE, quantiles = 2, alpha = 0.4, scale = 0.8, bandwidth = 0.4) +
  # dashed marker at the neutral midpoint (0) of each ridge
  geom_segment(aes(x = 0, xend = 0, y = dimension, yend = dimension + 0.8), colour = "black", linetype = 2) +
  # per-scale-point response counts printed just below each ridge
  geom_text(data = demographics_plot_data1, aes(label = n), position = position_nudge(y = -0.1), size = 2) +
  # geom_text(data = demographics_plot_data %>%
  #             distinct(participant_gender_category, Response_ID) %>%
  #             group_by(participant_gender_category) %>%
  #             count(),
  #           aes(x = -2.8, y = 5, label = paste0("n = ", n)), size = 4, hjust = 0, inherit.aes = FALSE) +
  # annotate(geom = "text", x = -3, y = 5, label = "", size = 0) +
  scale_x_continuous(breaks = c(-3, -2, -1, 0, 1, 2, 3), limits = c(-3.1, 3.1), expand = c(0.02, 0.02)) +
  # primary y axis labels = one scale endpoint, secondary axis = the opposite endpoint
  scale_y_continuous(breaks = 1:4, labels = c("masculine", "rural", "pessimistic", "liberal"),
                     sec.axis = sec_axis(~., breaks = 1:4, labels = c("feminine", "urban", "optimistic", "conservative")), limits = c(0.8, 4.8)) +
  # scale_color_manual(values = purple_blue_palette) +
  # scale_fill_manual(values = purple_blue_palette) +
  facet_nested_wrap(~experiment+participant_gender_category1) +
  xlab(NULL) +
  ylab(NULL) +
  theme_bw() +
  theme(legend.position = "none",
        strip.text = element_text(size = 12),
        axis.text = element_text(size = 10),
        panel.grid = element_line(colour = "#f9f9f9"))

#show the plot
demographics_plot

#save the plot to the plots folder
ggsave(plot = demographics_plot, filename = "plots/demographics_plot.png", height = 8, width = 8, dpi = 400)
# Age descriptives plus per-gender participant counts for the words sample.
# NOTE(review): median()/sd()/min()/max() are called without na.rm, so this
# assumes participant_age has no NAs — presumably guaranteed by the upstream
# filtering; confirm.
sociolex_demographics_words %>%
  summarise(median_age = median(participant_age),
            sd_age = sd(participant_age),
            min_age = min(participant_age),
            max_age = max(participant_age)) %>%
  bind_cols(sociolex_demographics_words %>%
              group_by(participant_gender_category) %>%
              count() %>%
              pivot_wider(names_from = participant_gender_category, values_from = n))
# Same summary for the images sample
sociolex_demographics_images %>%
  summarise(median_age = median(participant_age),
            sd_age = sd(participant_age),
            min_age = min(participant_age),
            max_age = max(participant_age)) %>%
  bind_cols(sociolex_demographics_images %>%
              group_by(participant_gender_category) %>%
              count() %>%
              pivot_wider(names_from = participant_gender_category, values_from = n))

Analysing gender, location, political alignment and valence ratings

In this part, we will focus only on scales where participants could use one option per word, i.e. gender, location, political, and valence. Namely, we will take the following steps:

  1. assess reliability of the ratings,
  2. aggregate the data,
  3. compare the ratings with ratings from other languages,
  4. calculate basic descriptive statistics,
  5. correlate dimensions between each other.

Reliability: Experiment 1 Words

To assess the reliability of the ratings, we will calculate Cronbach's alpha. First, we need to get the data into a suitable format. We will work with the raw data, which we will further divide based on the survey version. Cronbach's alpha will be calculated for each survey version separately.

We will remove any items that were known by fewer than 50% of the participants.

# Coverage check: list non-control items rated (non-NA value1) by fewer than
# half of the participants who saw them. An empty result means no word items
# need to be removed on this criterion.
sociolex_ratings_long_words %>%
  filter(!item %in% extra_words) %>%
  group_by(item) %>%
  summarise(n = n(),
            n_known = sum(!is.na(value1)),
            prop_known = n_known/n) %>%
  filter(prop_known < 0.5)

Then, we will create a data frame with columns for Response_ID, item, survey_version, dimension and rating.

# Long rating data restricted to analysable items, joined with each
# participant's survey version (join key: Response_ID)
reliability <- sociolex_ratings_long_words %>%
  filter(!item %in% extra_words, # get rid of control items
         !item %in% c("talmud", "imám")) %>% # drop items with too few ratings (Cronbach's alpha could not handle these)
  rename(rating = value1) %>%
  left_join(sociolex_demographics_words %>%
              select(Response_ID, survey_version)) %>%
  select(Response_ID, item, survey_version, dimension, rating) %>%
  # normalise survey labels so all versions share one naming scheme
  mutate(survey_version = str_remove(survey_version, "PRIMUS_"),
         survey_version = str_replace(survey_version, "prolific", "survey"))
## Joining with `by = join_by(Response_ID)`

Now, we will divide the data into separate objects. There will be a separate object for every survey and every dimension, e.g. reliability_survey_7_valence containing all valence ratings of words from survey seven. To calculate Cronbach's alpha using the alpha() function from the psych package, the objects will be structured in the following way: each row will represent one participant, each column will represent one word.

# Get unique survey versions
unique_versions <- unique(reliability$survey_version)

# Create a list of data frames, each containing rows for a specific survey version
list_of_versions <- split(reliability, reliability$survey_version)

# Iterate over each survey version data frame
for (version in unique_versions) {
  # Get the subset corresponding to the current survey version
  subset_df <- list_of_versions[[version]]
  
  # Get unique dimensions within the subset
  unique_dimensions <- unique(subset_df$dimension)
  
  # Split the subset based on the "dimension" column
  list_of_dimension_splits <- split(subset_df, subset_df$dimension)
  
  # Save each dimension split as a new object
  for (dimension in unique_dimensions) {
    # Get the data frame for the current dimension
    df <- list_of_dimension_splits[[dimension]]
    
    # Transform the data frame: one row per participant, one column per item
    transformed_df <- df %>%
      select(-survey_version, -dimension) %>%  # Remove survey_version and dimension columns
      pivot_wider(names_from = item, values_from = rating) %>%  # Pivot the data frame
      select(-Response_ID)  # Remove the Response_ID column
    
    # Create e.g. `reliability_survey_7_valence` in the global environment.
    # NOTE(review): assign() into .GlobalEnv is fragile (a named list would be
    # tidier), but the downstream alpha loop discovers these objects via ls(),
    # so the pattern is load-bearing here.
    assign(paste("reliability_", gsub(" ", "_", version), "_", gsub(" ", "_", dimension), sep = ""), transformed_df, envir = .GlobalEnv)
  }
}

Now, we will loop through all the objects and get the alphas.

# List all objects in the global environment
all_objects <- ls()

# Filter objects starting with "reliability_" (the per-survey/dimension wide
# data frames created above; the underscore excludes the long `reliability` frame)
reliability_objects <- grep("^reliability_", all_objects, value = TRUE)

# List to store alpha results
alpha_results <- list()

# Loop through each reliability data frame
for (obj_name in reliability_objects) {
  # Get the data frame
  df <- get(obj_name)
  
  # Calculate Cronbach's alpha; `alpha` resolves to psych::alpha here (psych
  # masks scales::alpha and ggplot2::alpha, see the attach messages above).
  # check.keys = TRUE reverse-scores items negatively correlated with the total.
  tryCatch({
    alpha_result <- alpha(df, check.keys = TRUE)
    alpha_value <- alpha_result$total$raw_alpha  # Extract raw alpha value
    alpha_results[[obj_name]] <- alpha_value
  }, error = function(e) {
    # Report and skip: a failing survey/dimension is simply absent from alpha_results
    cat("Error calculating Cronbach's alpha for object:", obj_name, "\n")
    cat("Error message:", conditionMessage(e), "\n\n")
  })
}

# Collect the raw alpha values into a wide table: one row per survey version,
# one column per dimension
alpha_results_all <- alpha_results %>%
  unlist() %>%
  as.data.frame() %>%
  rownames_to_column(var = "survey") %>%
  rename(alpha = 2) %>%
  arrange(alpha) %>%
  # strip the object-name prefix and fuse "survey_7" into "survey7" so that
  # separate() splits the remaining name into exactly two parts (survey, dimension)
  mutate(survey1 = str_remove_all(survey, "reliability_|_words"),
         survey1 = str_replace(survey1, "survey_", "survey")) %>%
  separate(survey1, into = c("survey1", "dimension")) %>%
  select(-survey) %>%
  pivot_wider(names_from = dimension, values_from = alpha)

As a last step, we will print out range of the alpha values for each of the dimensions.

# Range of Cronbach's alpha per dimension across survey versions.
# na.rm = TRUE guards against missing alphas (a survey whose alpha could not
# be computed would otherwise make the whole range NA) and keeps this summary
# consistent with the images-experiment version of the same table below.
alpha_results_all %>%
  pivot_longer(-survey1, names_to = "dimension", values_to = "alpha") %>%
  group_by(dimension) %>%
  summarise(min = round(min(alpha, na.rm = TRUE), 2),
            max = round(max(alpha, na.rm = TRUE), 2))

Due to the fact that for example Lynott et al. (2019) accepted all ratings with alpha values higher than .8, our values seem satisfactory.

Let’s clean the environment.

# List everything currently in the environment
objects <- ls()

# Pick out every object whose name starts with "reliability" — this also
# catches the long `reliability` frame, which is no longer needed
reliability_objects <- grep("^reliability", objects, value = TRUE)

# Drop them now that the alpha values have been extracted
rm(list = reliability_objects)

Reliability: Experiment 2 Images

To assess the reliability of the ratings, we will calculate Cronbach's alpha. First, we need to get the data into a suitable format. We will work with the raw data, which we will further divide based on the survey version. Cronbach's alpha will be calculated for each survey version separately.

We will remove any items that were known by fewer than 50% of the participants.

# Coverage check for the image ratings: list items (per colour/gray version)
# rated (non-NA value1) by fewer than half of the participants who saw them
sociolex_ratings_long_images %>%
  group_by(item, version) %>%
  summarise(n = n(),
            n_known = sum(!is.na(value1)),
            prop_known = n_known/n) %>%
  filter(prop_known < 0.5)
## `summarise()` has grouped output by 'item'. You can override using the
## `.groups` argument.

Then, we will create a data frame with columns for Response_ID, item, survey_version, dimension and rating.

# Long image-rating data with the colour/gray version folded into the item ID,
# restricted to analysable items and joined with each participant's survey
# version (join key: Response_ID)
reliability <- sociolex_ratings_long_images %>%
  mutate(item = paste0(item, "_", version)) %>%
  filter(!item %in% c("PICTURE_146_gray", "PICTURE_192_colour", "PICTURE_192_gray", "PICTURE_616_colour", "PICTURE_616_gray")) %>% # drop items with too few ratings (Cronbach's alpha could not handle these)
  rename(rating = value1) %>%
  left_join(sociolex_demographics_images %>%
              select(Response_ID, survey_version)) %>%
  select(Response_ID, item, survey_version, dimension, rating) %>%
  # normalise survey labels so all versions share one naming scheme
  mutate(survey_version = str_remove(survey_version, "PRIMUS_"),
         survey_version = str_replace(survey_version, "prolific", "survey"))
## Joining with `by = join_by(Response_ID)`

Now, we will divide the data into separate objects. There will be a separate object for every survey and every dimension, e.g. reliability_survey_7_valence containing all valence ratings of words from survey seven. To calculate Cronbach's alpha using the alpha() function from the psych package, the objects will be structured in the following way: each row will represent one participant, each column will represent one word.

# Get unique survey versions
unique_versions <- unique(reliability$survey_version)

# Create a list of data frames, each containing rows for a specific survey version
list_of_versions <- split(reliability, reliability$survey_version)

# Iterate over each survey version data frame.
# NOTE(review): this loop is identical to the words-experiment loop above;
# consider factoring both into a single helper function.
for (version in unique_versions) {
  # Get the subset corresponding to the current survey version
  subset_df <- list_of_versions[[version]]
  
  # Get unique dimensions within the subset
  unique_dimensions <- unique(subset_df$dimension)
  
  # Split the subset based on the "dimension" column
  list_of_dimension_splits <- split(subset_df, subset_df$dimension)
  
  # Save each dimension split as a new object
  for (dimension in unique_dimensions) {
    # Get the data frame for the current dimension
    df <- list_of_dimension_splits[[dimension]]
    
    # Transform the data frame: one row per participant, one column per item
    transformed_df <- df %>%
      select(-survey_version, -dimension) %>%  # Remove survey_version and dimension columns
      pivot_wider(names_from = item, values_from = rating) %>%  # Pivot the data frame
      select(-Response_ID)  # Remove the Response_ID column
    
    # Create e.g. `reliability_multipic_01_valence` in the global environment;
    # the downstream alpha loop discovers these objects via ls()
    assign(paste("reliability_", gsub(" ", "_", version), "_", gsub(" ", "_", dimension), sep = ""), transformed_df, envir = .GlobalEnv)
  }
}

Now, we will loop through all the objects and get the alphas.

# List all objects in the global environment
all_objects <- ls()

# Filter objects starting with "reliability_"
reliability_objects <- grep("^reliability_", all_objects, value = TRUE)

# List to store alpha results
alpha_results <- list()

# Loop through each reliability data frame; `alpha` resolves to psych::alpha
for (obj_name in reliability_objects) {
  # Get the data frame
  df <- get(obj_name)
  
  # Calculate Cronbach's alpha; failures are reported and the affected
  # survey/dimension is left out of alpha_results (see the multipic_01
  # location error in the recorded output below)
  tryCatch({
    alpha_result <- alpha(df, check.keys = TRUE)
    alpha_value <- alpha_result$total$raw_alpha  # Extract raw alpha value
    alpha_results[[obj_name]] <- alpha_value
  }, error = function(e) {
    cat("Error calculating Cronbach's alpha for object:", obj_name, "\n")
    cat("Error message:", conditionMessage(e), "\n\n")
  })
}
## 
## Likely variables with missing values are  PICTURE_187_colour PICTURE_234_colour  
## Error calculating Cronbach's alpha for object: reliability_multipic_01_location 
## Error message: I am sorry: missing values (NAs) in the correlation matrix do not allow me to continue.
## Please drop those variables and try again.
# Collect the raw alpha values into a wide table: one row per survey version,
# one column per dimension. The location alpha for multipic_01 failed above,
# so that cell is absent from alpha_results and becomes NA after pivoting.
alpha_results_all <- alpha_results %>%
  unlist() %>%
  as.data.frame() %>%
  rownames_to_column(var = "survey") %>%
  rename(alpha = 2) %>%
  arrange(alpha) %>%
  # fuse "multipic_01" into "multipic01" so that separate() splits the name
  # into exactly two parts (survey, dimension)
  mutate(survey1 = str_remove_all(survey, "reliability_"),
         survey1 = str_replace(survey1, "multipic_", "multipic")) %>%
  separate(survey1, into = c("survey1", "dimension")) %>%
  select(-survey) %>%
  pivot_wider(names_from = dimension, values_from = alpha)

As a last step, we will print out range of the alpha values for each of the dimensions.

# Report the observed range of alpha for each dimension; NAs (survey versions
# whose alpha could not be computed) are dropped before taking the range.
alpha_results_all %>%
  pivot_longer(cols = -survey1, names_to = "dimension", values_to = "alpha") %>%
  group_by(dimension) %>%
  summarise(
    min = round(min(alpha, na.rm = TRUE), digits = 2),
    max = round(max(alpha, na.rm = TRUE), digits = 2)
  )

Due to the fact that for example Lynott et al. (2019) accepted all ratings with alpha values higher than .8, our values seem satisfactory.

Let’s clean the environment.

# Snapshot the environment contents
objects <- ls()

# Everything whose name begins with "reliability" is an intermediate object
# (including the long `reliability` frame itself)
reliability_objects <- grep("^reliability", objects, value = TRUE)

# Remove the intermediates now that the alphas have been summarised
rm(list = reliability_objects)

Descriptives

# Per-item descriptives (mean, SD, n, share of raters with a non-NA rating)
# across both experiments; control words are excluded, as are image items
# containing "_MOUSE" (presumably non-experimental trials — confirm)
sociolex_ratings_summary <- sociolex_ratings_long_words %>%
  filter(!item %in% extra_words) %>%
  mutate(version = "Words",
         version1 = "Exp 1:\nWords") %>%
  bind_rows(sociolex_ratings_long_images %>%
              mutate(version2 = version,
                     version = "Images",
                     version1 = paste0("Exp 2:\nImages")) %>%
              filter(!str_detect(item, "_MOUSE"))) %>%
  group_by(item, dimension, version, version1, version2) %>%
  summarise(mean = mean(value1, na.rm = TRUE),
            sd = sd(value1, na.rm = TRUE),
            n = n(),
            n_known = sum(!is.na(value1)),
            prop_known = n_known/n) %>%
  ungroup()
## `summarise()` has grouped output by 'item', 'dimension', 'version', 'version1'.
## You can override using the `.groups` argument.
# Scale-endpoint labels for each facet (placed at x = ±3; hjust pulls them
# inward), joined with the grand mean, SD, SE and CI of the item means per
# experiment x dimension — used for the mean ± SD overlay in the plot below.
# std.error() and ci() come from the plotrix/gmodels packages loaded above.
sociolex_ratings_summary_label1 <- tibble(dimension = rep(unique(sociolex_ratings_summary$dimension), 4),
                                         mean = c(rep(3, 4), rep(-3, 4), rep(3, 4), rep(-3, 4)),
                                         hjust = c(rep(1, 4), rep(0, 4), rep(1, 4), rep(0, 4)),
                                         label = rep(c("fem", "urban", "cons", "pos",
                                                   "masc", "rural", "lib", "neg"), 2),
                                         version = c(rep("Words", 8), rep("Images", 8)),
                                         version1 = c(rep("Exp 1:\nWords", 8), rep("Exp 2:\nImages", 8))) %>%
  left_join(sociolex_ratings_summary %>%
              group_by(version, version1, dimension) %>%
              summarise(mean2 = mean(mean),
                        mean1 = paste0("mean = ", round(mean2, 2)),
                        sd = sd(mean),
                        se = std.error(mean),
                        lower_ci = ci(mean)[2],
                        upper_ci = ci(mean)[3],
                        n = n())
            # %>%
            #   mutate(se = sd/sqrt(n),
            #          lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
            #          upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se)
            ) %>%
  ungroup()
## `summarise()` has grouped output by 'version', 'version1'. You can override
## using the `.groups` argument.
# Summary plot: one panel per dimension. Each panel shows the distribution of
# item means per experiment (jittered points below a density ridge with
# quartile lines), the grand mean ± SD from sociolex_ratings_summary_label1,
# and labels for the scale endpoints.
# Fix: line geoms now use `linewidth` — `size` for lines has been deprecated
# since ggplot2 3.4 and emits warnings under the loaded ggplot2 3.5.1.
sociolex_ratings_summary_plot <- sociolex_ratings_summary %>%
  ggplot(aes(x = mean, y = version1, colour = mean)) +
  # one point per item, jittered and nudged below the ridge
  geom_point(position = position_jitternudge(height = 0.15,
                                             nudge.from = "jittered",
                                             y = -0.25),
             size = 0.2,
             alpha = 0.2) +
  # reference line at the neutral midpoint of the rating scale
  geom_vline(xintercept = 0,
             linetype = 2,
             inherit.aes = FALSE,
             colour = "grey",
             linewidth = 1) +
  geom_density_ridges(colour = "black",
                      fill = NA,
                      scale = 0.4,
                      trim = TRUE,
                      quantile_lines = TRUE,
                      quantiles = c(0.25, 0.5, 0.75),
                      vline_color = c("black")) +
  # scale-endpoint labels (e.g. "masc"/"fem") at the bottom of each panel
  geom_label(data = sociolex_ratings_summary_label1,
             aes(y = 0.25, label = label, hjust = hjust, vjust = -0.2)) +
  annotate("label",
           x = 0,
           y = 0.25,
           label = "neut",
           colour = "gray",
           fill = "#ffffbf",
           vjust = -0.2,
           inherit.aes = FALSE) +
  # grand mean ± SD per experiment and dimension
  geom_linerange(data = sociolex_ratings_summary_label1,
                 aes(xmin = mean2 - sd, xmax = mean2 + sd),
                 colour = "black",
                 linewidth = 0.5,
                 position = position_nudge(y = -0.05)) +
  geom_point(data = sociolex_ratings_summary_label1,
             aes(x = mean2),
             colour = "black",
             size = 2,
             position = position_nudge(y = -0.05)) +
  scale_x_continuous(breaks = seq(-3, 3, 1),
                     limits = c(-3.2, 3.2),
                     name = "Mean rating") +
  scale_y_discrete(labels = c("Exp 1:\nWords" = "Exp 1:\nWords\n(n = 2,999)",
                              "Exp 2:\nImages" = "Exp 2:\nImages\n(n = 1,000)")) +
  scale_colour_gradient2(low = "#2c7fb8",
                         midpoint = 0,
                         mid = "#ffffbf",
                         high = "#f03b20",
                         breaks = seq(-3.2, 3.2, 1),
                         limits = c(-3.2, 3.2)) +
  facet_wrap(~dimension, nrow = 1) +
  theme_bw() +
  theme(legend.position = "none",
        axis.title.y = element_blank(),
        axis.text.y = element_text(hjust = 0))

# Render the plot (ggridges prints the bandwidth it picks per facet)
sociolex_ratings_summary_plot
## Picking joint bandwidth of 0.188
## Picking joint bandwidth of 0.21
## Picking joint bandwidth of 0.105
## Picking joint bandwidth of 0.175

# Save the figure (rendering again triggers the same bandwidth messages)
ggsave(plot = sociolex_ratings_summary_plot, filename = "plots/ratings_summary1.png", width = 10, height = 3, dpi = 400)
## Picking joint bandwidth of 0.188
## Picking joint bandwidth of 0.21
## Picking joint bandwidth of 0.105
## Picking joint bandwidth of 0.175
# Rating coverage per experiment: how many ratings each item received and what
# share of those ratings were non-missing
sociolex_ratings_summary %>%
  group_by(version) %>%
  summarise(median_n = median(n),
            min_n = min(n),
            max_n = max(n),
            median_n_known = median(n_known),
            min_n_known = min(n_known),
            max_n_known = max(n_known),
            mean_prop_known = mean(prop_known),
            min_prop_known = min(prop_known),
            max_prop_known = max(prop_known))

Latent means

Traditionally norms are summarised by the mean and SD. However, an alternative is presented by Taylor et al. (2021), who highlight that means are not always suitable for ordinal data, which our data is. We follow the advice of Taylor et al. and use cumulative link mixed models to extract the estimates for each word from models predicting the raw ratings, whilst also accounting for random variation from participant rating biases.

Taylor, J. E., Rousselet, G. A., Scheepers, C., & Sereno, S. C. (2021, August 3). Rating Norms Should be Calculated from Cumulative Link Mixed Effects Models. https://doi.org/10.31234/osf.io/3vgwk

We will convert the ratings to an ordered factor and then process dimension after dimension.

# Cite the modelling package used below
citation("ordinal")
## To cite 'ordinal' in publications use:
## 
##   Christensen R (2023). _ordinal-Regression Models for Ordinal Data_. R
##   package version 2023.12-4,
##   <https://CRAN.R-project.org/package=ordinal>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ordinal---Regression Models for Ordinal Data},
##     author = {Rune H. B. Christensen},
##     year = {2023},
##     note = {R package version 2023.12-4},
##     url = {https://CRAN.R-project.org/package=ordinal},
##   }
# Convert each dimension's ratings to an ordered factor (-3..3) for the
# cumulative link mixed models. Image items get their version suffix
# (e.g. "_colour") so colour/gray variants stay distinct; the extra_words
# filter removes control items.
rating_data_ordinal <- sociolex_ratings_long_words %>%
  bind_rows(sociolex_ratings_long_images %>%
              mutate(item1 = item,
                     item = paste0(item1, "_", version))) %>%
  select(-value) %>%
  filter(!item %in% extra_words) %>%
  pivot_wider(names_from = dimension, values_from = value1) %>%
  mutate(gender = factor(gender, levels = -3:3, ordered = TRUE),
         location = factor(location, levels = -3:3, ordered = TRUE),
         political = factor(political, levels = -3:3, ordered = TRUE),
         valence = factor(valence, levels = -3:3, ordered = TRUE))
# For each dimension we fit an intercept-only cumulative link mixed model
# (probit link) with random intercepts for item and for rater (Response_ID),
# then extract the item random effects: these are the latent item estimates
# ("latent means") that account for participant rating biases.
# NOTE: these model fits can take a long time on the full data set.
#gender
m_gender <- clmm(gender ~ 1 + (1 | item) + (1 | Response_ID),
                 data = rating_data_ordinal,
                 link = "probit")

item_estimates_gender <- ranef(m_gender)$item %>%
  # store the item IDs in the item_id column rather than in row names
  as_tibble(rownames = "item") %>%
  # rename the random effects variable to something more informative than "(Intercept)"
  rename(gender = `(Intercept)`)
#location
m_location <- clmm(location ~ 1 + (1 | item) + (1 | Response_ID),
                   data = rating_data_ordinal,
                   link = "probit")

# extract the item random effects
item_estimates_location <- ranef(m_location)$item %>%
  # store the item IDs in the item_id column rather than in row names
  as_tibble(rownames = "item") %>%
  # rename the random effects variable to something more informative than "(Intercept)"
  rename(location = `(Intercept)`)
#political
m_political <- clmm(political ~ 1 + (1 | item) + (1 | Response_ID),
                    data = rating_data_ordinal,
                    link = "probit")

# extract the item random effects
item_estimates_political <- ranef(m_political)$item %>%
  # store the item IDs in the item_id column rather than in row names
  as_tibble(rownames = "item") %>%
  # rename the random effects variable to something more informative than "(Intercept)"
  rename(political = `(Intercept)`)
#valence
m_valence <- clmm(valence ~ 1 + (1 | item) + (1 | Response_ID),
                  data = rating_data_ordinal,
                  link = "probit")

# extract the item random effects
item_estimates_valence <- ranef(m_valence)$item %>%
  # store the item IDs in the item_id column rather than in row names
  as_tibble(rownames = "item") %>%
  # rename the random effects variable to something more informative than "(Intercept)"
  rename(valence = `(Intercept)`)

Now we just bind the dimensions and add the latent means to the summary data.

# Join the four per-dimension latent estimates into one table and reshape to
# long format (one row per item x dimension). The joins key on `item` only,
# hence the three "Joining with" messages below.
latent_means <- item_estimates_gender %>%
  left_join(item_estimates_location) %>%
  left_join(item_estimates_political) %>%
  left_join(item_estimates_valence) %>%
  arrange(item) %>%
  # mutate(version = "Exp 1:\nWords") %>%
  pivot_longer(gender:valence, names_to = "dimension", values_to = "latent_mean")
## Joining with `by = join_by(item)`
## Joining with `by = join_by(item)`
## Joining with `by = join_by(item)`
# Attach the latent means to the summary data. Image items must first get the
# same "_<version2>" suffix used when the models were fitted so the keys match.
sociolex_ratings_summary <- sociolex_ratings_summary %>%
  mutate(item = ifelse(version == "Images", paste0(item, "_", version2), item)) %>%
  left_join(latent_means)
## Joining with `by = join_by(item, dimension)`

TRADITIONAL VERSUS LATENT MEANS

To summarise the ratings for words, we used two measures: the standard mean and the latent mean. The former is traditional and used in almost all rating studies, but it does not take into account the fact that a Likert scale response is not a continuous variable. Participants may use the scale in different ways, which means we cannot assume the distances between the points on the scale are equal, e.g. that the difference between “neutral” and “slightly feminine” is the same as between “slightly feminine” and “feminine”. The only thing we can assume is that all participants understood the order of the scale, e.g. that “slightly feminine” lies somewhere between “neutral” and “feminine”. The latent means take this variance into account by treating participants as random effects.

We will compare the standard and the latent means to see to what extent they differ. If the difference is vast, we will use the theoretically better motivated latent means for the further analyses. However, if the difference is not strong, we will stick with the traditional means.

Now we calculate the correlation between the latent mean and the standard mean for all dimensions.

# Pearson correlation between the traditional mean and the latent mean,
# computed separately per dimension. cor() returns NA if either column
# contains NA; items missing a latent mean would surface here as NA.
sociolex_ratings_summary %>%
  group_by(dimension) %>%
  summarise(correlation = cor(mean, latent_mean))

It is clear the two calculations yield almost the same results. Next, we will visualize the relationship of latent and standard means.

# Scatter of traditional mean (x) against latent mean (y), one panel per
# dimension. The dashed identity line marks perfect agreement; the blue
# line is a linear fit.
sociolex_ratings_summary %>%
  ggplot(aes(x = mean, y = latent_mean)) +
  # geom_vline(xintercept = 0, linetype = 2) +
  # geom_hline(yintercept = 0, linetype = 2) +
  geom_point(size = 0.1) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  # `linewidth` replaces the deprecated `size` aesthetic for line geoms
  # (ggplot2 >= 3.4; this document uses 3.5.1).
  geom_smooth(method = "lm", color = "blue", linewidth = 0.5) +
  scale_y_continuous(breaks = seq(-3, 3, 1)) +
  facet_wrap(~dimension) +
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

We can see that the two metrics differ only in the extremes with the latent means yielding more pronounced scores. This, however, concerns only a small portion of the data. In the following analyses, we will stick to using the traditional means.

Comparison to existing data

# Import Warriner et al. English valence norms; keep only the word and the
# summed valence mean, renamed for joining on `word_english`.
warriner_valence <- read_delim("secondary_data/warriner.csv", delim = ",") %>%
  select(Word, V.Mean.Sum) %>%
  rename(word_english = Word,
         warriner_valence_mean = V.Mean.Sum)
## New names:
## Rows: 13915 Columns: 65
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): Word dbl (64): ...1, V.Mean.Sum, V.SD.Sum, V.Rat.Sum, A.Mean.Sum,
## A.SD.Sum, A.Rat...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Import the Glasgow norms (Scott et al.); keep valence and gender means.
# Note this file is semicolon-delimited, unlike the Warriner file.
scott_gender <- read_delim("secondary_data/glasgow.csv", delim = ";") %>%
  select(Words, valence_mean, gender_mean) %>%
  rename(word_english = Words,
         scott_valence_mean = valence_mean,
         scott_gender_mean = gender_mean)
## Rows: 5553 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr  (1): Words
## dbl (28): Length, arousal_mean, arousal_sd, arousal_n, valence_mean, valence...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Word-only subset of the summary, with the English glosses attached
# (version lowercased to match the key used in words_coding).
sociolex_ratings_summary1 <- sociolex_ratings_summary %>%
  filter(version == "Words") %>%
  mutate(version = tolower(version)) %>%
  left_join(words_coding)
## Joining with `by = join_by(item, version)`
# Overlap check: which of our words have a Warriner valence norm.
warriner_valence %>%
  filter(word_english %in% sociolex_ratings_summary1$word_english) %>%
  left_join(sociolex_ratings_summary1 %>%
              filter(dimension == "valence"))
## Joining with `by = join_by(word_english)`
# Overlap check: which of our words appear in the Glasgow norms.
scott_gender %>%
  filter(word_english %in% sociolex_ratings_summary1$word_english)
# One row per word, dimensions as columns, with the external norms joined on.
sociolex_ratings_summary_extra <- sociolex_ratings_summary1 %>%
  select(item, word_czech, word_english, dimension, mean) %>%
  pivot_wider(names_from = dimension, values_from = mean) %>%
  left_join(warriner_valence) %>%
  left_join(scott_gender)
## Joining with `by = join_by(word_english)`
## Joining with `by = join_by(word_english)`
# Visual check of our valence means against the Warriner norms.
sociolex_ratings_summary_extra %>%
  ggplot(aes(x = valence, y = warriner_valence_mean)) +
  geom_point(size = 0.5, alpha = 0.5) +
  theme_bw()

# Correlations between our means and the external norms. Note that
# cor.test() has no `use` argument -- the `use = "complete"` below is
# silently absorbed by `...` and ignored; cor.test() drops incomplete
# pairs itself, so the results are nevertheless complete-case.
cor.test(sociolex_ratings_summary_extra$valence, sociolex_ratings_summary_extra$warriner_valence_mean, use = "complete")
## 
##  Pearson's product-moment correlation
## 
## data:  sociolex_ratings_summary_extra$valence and sociolex_ratings_summary_extra$warriner_valence_mean
## t = 70.276, df = 1544, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8604126 0.8842057
## sample estimates:
##       cor 
## 0.8728268
cor.test(sociolex_ratings_summary_extra$valence, sociolex_ratings_summary_extra$scott_valence_mean, use = "complete")
## 
##  Pearson's product-moment correlation
## 
## data:  sociolex_ratings_summary_extra$valence and sociolex_ratings_summary_extra$scott_valence_mean
## t = 77.74, df = 1164, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9059105 0.9245050
## sample estimates:
##       cor 
## 0.9156965
# The gender correlation is negative, which suggests the two scales run in
# opposite directions -- presumably Glasgow codes the feminine pole where we
# code the masculine one (or vice versa); confirm against the Glasgow coding.
cor.test(sociolex_ratings_summary_extra$gender, sociolex_ratings_summary_extra$scott_gender_mean, use = "complete")
## 
##  Pearson's product-moment correlation
## 
## data:  sociolex_ratings_summary_extra$gender and sociolex_ratings_summary_extra$scott_gender_mean
## t = -29.837, df = 1164, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6896507 -0.6244978
## sample estimates:
##        cor 
## -0.6583054

Proportions

# Per-item, per-dimension rating distribution as counts and proportions over
# the seven scale points (-3..3).
# FIX: `count(value1, .drop = FALSE)` only retains empty levels when the
# counted variable is a factor, so we convert `value1` first (mirroring the
# entropy pipeline below). Without this, a scale point used by no item at
# all would be silently absent from the proportions table.
sociolex_ratings_summary_proportions <- sociolex_ratings_long_words %>%
  bind_rows(sociolex_ratings_long_images %>%
              mutate(item1 = item,
                     item = paste0(item1, "_", version))) %>%
  filter(!is.na(value1),
         !item %in% extra_words) %>%
  mutate(value1 = factor(value1, levels = c(-3, -2, -1, 0, 1, 2, 3), ordered = TRUE)) %>%
  group_by(item, dimension) %>%
  count(value1, .drop = FALSE) %>%
  pivot_wider(names_from = value1, values_from = n, values_fill = 0) %>%
  pivot_longer(c(-item, -dimension), names_to = "rating", values_to = "count") %>%
  mutate(rating = factor(rating, levels = c(-3, -2, -1, 0, 1, 2, 3), ordered = TRUE)) %>%
  arrange(item, dimension, rating) %>%
  group_by(item, dimension) %>%
  mutate(sum = sum(count)) %>%
  ungroup() %>%
  mutate(prop = count/sum)

Entropy

# Shannon entropy of each item x dimension rating distribution -- a measure
# of rater disagreement that, unlike SD, does not assume interval spacing.
# `value1` is converted to a factor so `.drop = FALSE` keeps zero-count
# scale points, and the wide/long round trip guarantees all 7 counts exist.
# NOTE(review): entropy() is presumably entropy::entropy or a helper defined
# earlier in the document -- it is not a base/psych function; confirm.
sociolex_ratings_summary_entropy <- sociolex_ratings_long_words %>%
  bind_rows(sociolex_ratings_long_images %>%
              mutate(item1 = item,
                     item = paste0(item1, "_", version))) %>%
  filter(!is.na(value1),
         !item %in% extra_words) %>%
  mutate(value1 = factor(value1, levels = c(-3, -2, -1, 0, 1, 2, 3), ordered = TRUE)) %>%
  group_by(item, dimension) %>%
  count(value1, .drop = FALSE) %>%
  pivot_wider(names_from = value1, values_from = n, values_fill = 0) %>%
  ungroup() %>%
  pivot_longer(c(-item, -dimension), names_to = "rating", values_to = "count") %>%
  group_by(item, dimension) %>%
  summarise(entropy = entropy(count)) %>%
  ungroup()
## `summarise()` has grouped output by 'item'. You can override using the
## `.groups` argument.
# Attach the entropy values to the main summary table.
sociolex_ratings_summary <- sociolex_ratings_summary %>%
  left_join(sociolex_ratings_summary_entropy)
## Joining with `by = join_by(item, dimension)`
# Diagnostic plots: how entropy relates to the classic summary statistics,
# rating SD (spread) and rating mean (central tendency), per dimension.
ggplot(sociolex_ratings_summary, aes(x = sd, y = entropy)) +
  geom_point(alpha = 0.2, size = 0.2) +
  facet_wrap(~dimension) +
  theme_bw()

ggplot(sociolex_ratings_summary, aes(x = mean, y = entropy)) +
  geom_point(alpha = 0.2, size = 0.2) +
  facet_wrap(~dimension) +
  theme_bw()

Age

Proportions and modes

# Per-item proportions for each age category ("žádný" is Czech for "none",
# later renamed to "no age"). For every item we count how many participants
# selected each category and divide by the number of participants who knew
# the word. The `sum` column totals the eight proportions as a sanity check.
# NOTE(review): age_rating_aggregated appears to be a 0/1 selection
# indicator (summed to get choice counts; NA = word unknown) -- confirm.
sociolex_ratings_proportion_age <- sociolex_ratings_long_words_age %>%
  filter(!item %in% extra_words) %>% # get rid of the control items
  mutate(age_category = factor(age_category)) %>%
  group_by(item, age_category, .drop = FALSE) %>%
  summarise(sum_rating = sum(age_rating_aggregated, na.rm = TRUE), # how many participants chose each option
            # n_age = sum(is.na(age_rating_aggregated)),
            n_known_age = sum(!is.na(age_rating_aggregated))) %>% # how many participants knew the word
  ungroup() %>%
  mutate(age_category = str_remove(age_category, "age_")) %>%
  mutate(prop = sum_rating/n_known_age) %>% # get the proportion for each option
  select(-sum_rating) %>%
  pivot_wider(names_from = age_category, values_from = prop) %>%
  mutate(sum = `0_6`+`07_17`+`18_30`+`31_50`+`51_65`+`66_80`+`81+`+žádný) %>%
  rename(`no age` = žádný)
## `summarise()` has grouped output by 'item'. You can override using the
## `.groups` argument.

We will also calculate the mode age category (which category has the highest prop value).

# Determine each item's modal age category: slice_max() keeps all categories
# tied for the highest proportion, add_count() records the number of tied
# modes in `n`, and slice_sample(n = 1) breaks ties by picking one at random.
# NOTE(review): no set.seed() precedes this, so the modal category of tied
# items can differ between runs -- consider seeding for reproducibility.
sociolex_ratings_proportion_age <- sociolex_ratings_proportion_age %>%
  pivot_longer(c(`0_6`:`no age`), names_to = "age_category", values_to = "prop") %>%
  group_by(item) %>%
  slice_max(prop) %>%
  add_count() %>%
  slice_sample(n = 1) %>%
  ungroup() %>%
  rename(age_mode = age_category) %>%
  mutate(age_mode = str_replace(age_mode, "_", "-")) %>%
  mutate(age_mode = factor(age_mode, levels = c("0-6", "07-17", "18-30", "31-50", "51-65", "66-80", "81+", "no age"))) %>%
  left_join(sociolex_ratings_proportion_age)
## Joining with `by = join_by(item, n_known_age, sum)`

How many times did participants choose each option?

# Seven viridis-style colours for the age bands plus grey for "no age".
age_palette <- c("#fde725", "#90d743", "#35b779", "#21918c", "#31688e", "#443983", "#440154", "#C0C0C0")

# Bar chart of how often each age option was chosen overall.
# NOTE(review): the if_else counts an explicit "no age = 1" when age_sum == 0;
# presumably age_sum is the number of categories a participant ticked, so a
# zero means "no age applies" -- confirm against the coding of the raw data.
sociolex_ratings_long_words_age %>%
  filter(!item %in% extra_words) %>% # get rid of the control items
  mutate(age_category = factor(age_category)) %>%
  mutate(age_rating = if_else(age_category == "age_žádný" & age_sum == 0, 1, age_sum*age_rating_aggregated)) %>%
  group_by(age_category, .drop = FALSE) %>%
  summarise(sum_rating = sum(age_rating, na.rm = TRUE)) %>% 
  ggplot(aes(x = age_category, y = sum_rating)) +
  geom_bar(stat = "identity", width = 0.8, aes(fill = age_category)) +
  scale_fill_manual(values = age_palette) +
  labs(x = "Age Category", y = "Sum Rating") +
  theme_bw() +
  theme(legend.position = "none",
        plot.margin = margin(10, 10, 10, 10),
        axis.text.x = element_text(size = 10, angle = 45, vjust = 0.5)) +
  # Map the internal category names to human-readable axis labels.
  scale_x_discrete(labels = c("age_žádný" = "No Age", 
                              "age_0_6" = "0-6", 
                              "age_07_17" = "7-17",
                              "age_18_30" = "18-30",
                              "age_31_50" = "31-50",
                              "age_51_65" = "51-65",
                              "age_66_80" = "66-80",
                              "age_81+" = "81+"))

How many items have each option as their mode?

# Count how many items have each age category as their mode,
# listed in the factor's level order.
count(sociolex_ratings_proportion_age, age_mode) %>%
  arrange(age_mode)

What is the distribution of proportions in the modes?

# Boxplots of the winning (modal) proportion within each modal age category.
ggplot(sociolex_ratings_proportion_age,
       aes(x = age_mode, y = prop, fill = age_mode)) +
  geom_boxplot() +
  scale_fill_manual(values = age_palette) +
  theme_bw() +
  theme(legend.position = "none")

# Jittered points plus narrow boxplots of the per-item proportions for every
# age category. The commented-out layers are earlier design iterations kept
# for reference. NOTE(review): position_jitternudge() is unseeded, so point
# positions vary between renders (the summaries do not).
age_prop_plot <- sociolex_ratings_proportion_age %>%
  select(item, `0_6`:`no age`) %>%
  pivot_longer(-item) %>%
  ggplot(aes(x = name, y = value, colour = name)) +
  geom_point(position = position_jitternudge(x = -0.2, width = 0.2, nudge.from = "jittered"), size = 0.2, alpha = 0.2) +
  geom_boxplot(outlier.shape = NA, width = 0.2, position = position_nudge(x = 0.1)) +
  # geom_hline(yintercept = 0, linetype = 2) +
  # geom_point(position = position_jitternudge(height = 0.15, nudge.from = "jittered", y = -0.25), size = 0.2, alpha = 0.2) +
  # # geom_vline(xintercept = 0, linetype = 2, inherit.aes = FALSE, colour = "grey", size = 1) +
  # # geom_density_ridges(colour = NA, fill = NA, scale = 0.4, trim = TRUE, quantile_lines = TRUE, quantiles = c(0.25, 0.5, 0.75), vline_color = c("black")) +
  # geom_density_ridges(aes(height = ..density..), fill = NA, stat = "density", trim = TRUE, scale = 0.4) +
  scale_color_manual(values = age_palette) +
  xlab("") +
  ylab("Proportion") +
  # facet_grid(~name, labeller = labeller(name = c(age_PC1 = "PC1", age_PC2 = "PC2", age_PC3 = "PC3"))) +
  theme_bw() +
  theme(legend.position = "none")

age_prop_plot

PCA

We will use the age data to explore whether there is underlying structure that can be interpreted by reducing the dimensions down. This is done by running a PCA on the proportions from each of the 8 age categories (including the “no age” option).

# Matrix of per-item age proportions for the PCA (all eight columns,
# including "no age"). The commented-out code documents logit / arcsine
# transformations that were tried and abandoned.
pca_age1 <- sociolex_ratings_proportion_age %>%
  # select(item, `0_6`:`81+`) %>%
  # pivot_longer(`0_6`:`81+`) %>%
  # mutate(logit_prop = log(value+1/(1-value+1)),
  #        arcsin_prop = asin(sqrt(value))) %>%
  # select(item, name, logit_prop) %>%
  # pivot_wider(names_from = name, values_from = logit_prop) %>%
  # ungroup() %>%
  # select(item, `0_6`:`81+`) %>%
  # pivot_longer(-item) %>%
  # group_by(name) %>%
  # mutate(prop_adjusted = ifelse(value == 0, 0.001,
  #                               ifelse(value == 1, 0.999, value)),
  #        prop_adjusted_logit = log(prop_adjusted/(1-prop_adjusted))) %>%
  # select(item, `0_6`:`no age`) %>%
  select(`0_6`:`no age`)

# PCA on the correlation matrix (cor = TRUE standardises each column).
pca_age <- pca_age1 %>%
  princomp(cor = TRUE)

Now we inspect the components.

# Variance explained by each component. Comp.8 has zero variance because the
# eight proportions sum to (approximately) 1 per item, making one column
# linearly dependent on the others.
summary(pca_age)
## Importance of components:
##                           Comp.1    Comp.2    Comp.3    Comp.4    Comp.5
## Standard deviation     1.6449810 1.4091837 1.1150982 0.9893253 0.7433266
## Proportion of Variance 0.3382453 0.2482248 0.1554305 0.1223456 0.0690668
## Cumulative Proportion  0.3382453 0.5864702 0.7419006 0.8642462 0.9333130
##                            Comp.6     Comp.7 Comp.8
## Standard deviation     0.58743415 0.43407040      0
## Proportion of Variance 0.04313486 0.02355214      0
## Cumulative Proportion  0.97644786 1.00000000      1
# Loadings: how each age category contributes to each component
# (values below princomp's default cutoff are printed as blanks).
pca_age$loadings
## 
## Loadings:
##        Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 0_6     0.240  0.361         0.682  0.454  0.200         0.314
## 07_17   0.438  0.178  0.344        -0.630 -0.352         0.355
## 18_30   0.340 -0.377  0.308 -0.441  0.251  0.417 -0.100  0.454
## 31_50  -0.211 -0.587 -0.112  0.229  0.199 -0.520  0.281  0.398
## 51_65  -0.485 -0.199         0.317 -0.420  0.357 -0.485  0.291
## 66_80  -0.476  0.280  0.281 -0.129         0.269  0.687  0.222
## 81+    -0.354  0.390  0.304 -0.291  0.320 -0.437 -0.448  0.226
## no age         0.286 -0.775 -0.282                       0.475
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000

The summary shows the first component captures about 34% of the variance, the second component captures about 25% of the variance and the third captures more than 15% of the variance.

We will now add the three PC scores to our summary data set.

# Attach the first three PC scores to the age proportion data. Row order of
# pca_age$scores matches sociolex_ratings_proportion_age, so the item IDs can
# be assigned positionally before joining back on `item`.
sociolex_ratings_proportion_age_pca <- pca_age$scores %>%
  as_tibble() %>%
  mutate(item = sociolex_ratings_proportion_age$item) %>%
  select(item, Comp.1:Comp.3) %>%
  left_join(sociolex_ratings_proportion_age) %>%
  rename(age_PC1 = Comp.1,
         age_PC2 = Comp.2,
         age_PC3 = Comp.3)
## Joining with `by = join_by(item)`

We will use the mode values to plot the distribution of PC scores, showing how each mode group clusters words that have positive or negative PCs scores.

# Half-point / half-violin plots of the three PC scores grouped by modal age
# category, one facet per component. NOTE(review): geom_half_point() jitters
# without a seed, so point positions vary between renders.
pca_age_scores_plot <- sociolex_ratings_proportion_age_pca %>%
  pivot_longer(age_PC1:age_PC3) %>%
  ggplot(aes(x = age_mode, y = value, colour = age_mode)) +
  geom_half_point(size = 0.2, alpha = 0.2, side = "l") +
  geom_half_violin(draw_quantiles = c(0.25, 0.5, 0.75), side = "r") +
  geom_hline(yintercept = 0, linetype = 2) +
  scale_y_continuous(breaks = seq(-4, 4, 4)) +
  scale_color_manual(values = age_palette) +
  xlab("Age mode category\n") +
  ylab("Principal component score") +
  facet_grid(~name, labeller = labeller(name = c(age_PC1 = "PC1", age_PC2 = "PC2", age_PC3 = "PC3"))) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 10, angle = 45, hjust = 1),
        axis.title = element_text(size = 12),
        strip.text = element_text(size = 14))

pca_age_scores_plot

# FIX: use the lower-case "plots/" directory like every other ggsave() call
# in this document; "Plots/" would silently create a second directory on
# case-sensitive file systems.
ggsave(plot = pca_age_scores_plot, filename = "plots/pca_age_scores_plot1.png", height = 3, width = 8, dpi = 400)
# Relationship between each raw age-category proportion (x) and each PC
# score (y): facet grid of PC (rows) by category (columns), with GAM
# smooths. Item IDs are re-attached positionally (same row order as the
# PCA input) before joining on the long PC-score table.
pca_age_plot1 <- pca_age1 %>%
  mutate(item = sociolex_ratings_proportion_age_pca$item) %>%
  pivot_longer(`0_6`:`no age`) %>%
  left_join(sociolex_ratings_proportion_age_pca %>%
              select(item, age_PC1:age_PC3) %>%
              pivot_longer(age_PC1:age_PC3, names_to = "PC", values_to = "PC_score")) %>%
  ggplot(aes(x = value, y = PC_score, colour = name)) +
  geom_point(size = 0.01, alpha = 0.05) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_smooth(method = "gam") +
  scale_color_manual(values = age_palette) +
  # scale_x_sqrt() +
  # scale_linetype_manual(values = c("solid", "dotted")) +
  xlab("Weighted proportion") +
  ylab("Principal component score") +
  facet_grid(PC~name, labeller = labeller(PC = c(age_PC1 = "PC1", age_PC2 = "PC2", age_PC3 = "PC3"))) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = element_text(size = 5),
        plot.margin = margin(0.2, 0.1, 0.1, 0.1, "cm"))
## Joining with `by = join_by(item)`
pca_age_plot1
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'

ggsave(plot = pca_age_plot1, filename = "plots/pca_age_scores.png", width = 10, height = 4, dpi = 400)
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
# Stack the two PCA figures into one labelled (A/B) panel figure.
# NOTE(review): plot_grid() is from cowplot, which is not among the library()
# calls shown at the top of this document -- presumably loaded elsewhere;
# confirm the package is attached before this chunk runs.
pca_plot_combined <- plot_grid(
  pca_age_scores_plot, pca_age_plot1,
  labels = "AUTO",
  ncol = 1,
  rel_heights = c(0.48, 0.52)
)
## `geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
pca_plot_combined

ggsave(plot = pca_plot_combined, filename = "plots/pca_plot_combined.png", width = 10, height = 8, dpi = 400)

Correlations between variables

# Long-format table of per-word summary values: the four rating dimensions
# (traditional means) stacked with the three age PC scores, all sharing the
# (item, dimension, mean) layout so they can be cross-correlated below.
age_pcs_long <- sociolex_ratings_proportion_age_pca %>%
  select(item:age_PC3) %>%
  pivot_longer(age_PC1:age_PC3, names_to = "dimension", values_to = "mean")

word_ratings_summary_long <- sociolex_ratings_summary %>%
  filter(version == "Words") %>%
  select(item, dimension, mean) %>%
  bind_rows(age_pcs_long)

# Panel function for the ggpairs lower triangle: scatter plot with dashed
# zero reference lines plus a loess (red) and a linear (blue) fit.
#
# Arguments:
#   data, mapping - supplied per-panel by GGally::ggpairs.
#   ...           - unused; kept so wrap() can pass extra arguments.
# Returns a ggplot object.
scatter_reg_curve <- function(data, mapping, ...) {
  # `linewidth` replaces the deprecated `size` aesthetic for line geoms
  # (ggplot2 >= 3.4; this document uses 3.5.1).
  ggplot(data = data, mapping = mapping) +
    geom_point(alpha = 0.05, size = 0.1) +
    geom_vline(xintercept = 0, linetype = 2, inherit.aes = FALSE, linewidth = 0.4) +
    geom_hline(yintercept = 0, linetype = 2, inherit.aes = FALSE, linewidth = 0.4) +
    geom_smooth(method = "loess", colour = "red", se = FALSE, linewidth = 0.5) + # Add regression curve
    geom_smooth(method = "lm", colour = "blue", se = FALSE, linewidth = 0.5) + # Add regression curve
    theme_bw()
}

# Panel function for the ggpairs upper triangle: prints the correlation
# coefficient (3 decimals, complete observations only) centred in the panel.
cor_func <- function(data, mapping, method, ...) {
  x_vals <- eval_data_col(data, mapping$x)
  y_vals <- eval_data_col(data, mapping$y)
  r <- cor(x_vals, y_vals, method = method, use = "complete.obs")

  ggally_text(
    label = as.character(round(r, 3)),
    mapping = aes(),
    xP = 0.5,
    yP = 0.5,
    color = "black",
    ...
  ) +
    theme_bw(base_line_size = 0)
}

# Creating the correlation matrix plot
# Pairwise matrix of all dimensions and age PCs: scatter + fits in the lower
# triangle (scatter_reg_curve), densities on the diagonal, and Pearson
# correlations in the upper triangle (cor_func).
plot_matrix <- ggpairs(
  word_ratings_summary_long %>%
    select(item, dimension, mean) %>%
    pivot_wider(names_from = dimension, values_from = mean) %>%
    select(-item),
  lower = list(
    continuous = wrap(scatter_reg_curve)
  ),
  diag = list(continuous = function(data, mapping, ...) {
                  ggally_densityDiag(data = data, mapping = mapping) + 
                  theme_bw()}),
  upper = list(continuous = wrap(cor_func, method = "pearson", size = 4, stars = FALSE)),
  )

# Display the plot
plot_matrix

ggsave(plot_matrix, filename = "plots/correlation_plot_matrix.png", height = 8, width = 10, dpi = 400)

Top ratings for each dimension

Negative values

# Ten lowest-valued items per dimension, formatted as "item (mean)".
# `row_number()` replaces the hard-coded `1:10`, which would error if any
# dimension ever had fewer than 10 items.
word_ratings_summary_long %>%
  group_by(dimension) %>%
  arrange(mean) %>%
  slice_head(n = 10) %>%
  mutate(mean = paste0(item, " (", round(mean, 2), ")"),
         rank = row_number()) %>%
  select(rank, dimension, mean) %>%
  pivot_wider(names_from = dimension, values_from = mean)

Positive values

# Ten highest-valued items per dimension, formatted as "item (mean)".
# `row_number()` replaces the hard-coded `1:10`, which would error if any
# dimension ever had fewer than 10 items.
word_ratings_summary_long %>%
  group_by(dimension) %>%
  arrange(-mean) %>%
  slice_head(n = 10) %>%
  mutate(mean = paste0(item, " (", round(mean, 2), ")"),
         rank = row_number()) %>%
  select(rank, dimension, mean) %>%
  pivot_wider(names_from = dimension, values_from = mean)

Correlations between words, colour and grayscale

# Wide table of mean ratings for the multipic items: one column per
# dimension x presentation version (Colour / Gray / Words); word rows get
# version2 = "Words" since they have no image version.
# (Removed the dead `mutate(item_ID1 = ...)` step: the column was created
# and then immediately discarded by the following select().)
sociolex_ratings_summary2 <- sociolex_ratings_summary %>%
  distinct(item, version, dimension, mean) %>%
  left_join(sociolex_coding) %>%
  filter(source == "multipic") %>%
  mutate(version2 = ifelse(is.na(version2), "Words", version2)) %>%
  select(item_ID, dimension, mean, version2) %>%
  pivot_wider(names_from = c(dimension, version2), values_from = mean)
# Helper: pull one (dimension, comparison) pair of mean columns out of the
# wide table as generic x/y columns so all pairs can be stacked.
extract_comparison <- function(data, dim_name, comparison_label, col_x, col_y) {
  data %>%
    select(item_ID, x = all_of(col_x), y = all_of(col_y)) %>%
    mutate(dimension = dim_name,
           comparison = comparison_label)
}

# All dimension x version comparisons (colour ~ gray, colour ~ words,
# gray ~ words for gender/location/political/valence), stacked long.
# This replaces twelve near-identical select/mutate/rename/bind_rows blocks;
# row order (dimension-major, then comparison) and column order
# (item_ID, x, y, dimension, comparison) match the original construction.
sociolex_ratings_summary2a <- expand_grid(
  dim_name = c("gender", "location", "political", "valence"),
  tibble(comparison_label = c("colour ~ gray", "colour ~ words", "gray ~ words"),
         suffix_x = c("Colour", "Colour", "Gray"),
         suffix_y = c("Gray", "Words", "Words"))
) %>%
  pmap(function(dim_name, comparison_label, suffix_x, suffix_y) {
    extract_comparison(sociolex_ratings_summary2,
                       dim_name, comparison_label,
                       paste0(dim_name, "_", suffix_x),
                       paste0(dim_name, "_", suffix_y))
  }) %>%
  bind_rows()
# Correlation label and p value for each dimension x comparison cell.
# FIX: the previous `str_sub(round(cor, 2), 2)` stripped the first character
# to drop the leading zero ("0.87" -> ".87"), but for a negative correlation
# it stripped the minus sign instead ("-0.66" -> "0.66"). The regex below
# removes only the leading "0" while preserving any sign, and sprintf()
# guarantees a consistent two-decimal format.
correlation_comparison_values <- sociolex_ratings_summary2a %>%
  group_by(dimension, comparison) %>%
  summarise(cor = paste0("r = ", str_replace(sprintf("%.2f", stats::cor(x, y)), "^(-?)0\\.", "\\1.")),
            cor_p = round(stats::cor.test(x, y)$p.value, 5))

correlation_comparison_values %>%
  paged_table()
# Inspect the two example images highlighted (as A/B) in the plot below.
sociolex_ratings_summary2a %>%
                filter(item_ID %in% c("image_19", "image_33")) %>%
                mutate(letter = ifelse(item_ID == "image_33", "A", "B")) %>%
  arrange(item_ID)
# Scatter plots comparing mean ratings between versions (facet grid of
# dimension x comparison), with the two example images labelled A/B in red
# and the per-panel correlation printed in the top-left corner.
correlation_comparison_plot <- sociolex_ratings_summary2a %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.1, size = 0.1) +
  # `linewidth` replaces the deprecated `size` aesthetic for line geoms
  # (ggplot2 >= 3.4; this document uses 3.5.1).
  geom_vline(xintercept = 0, linetype = 2, inherit.aes = FALSE, linewidth = 0.4) +
  geom_hline(yintercept = 0, linetype = 2, inherit.aes = FALSE, linewidth = 0.4) +
  geom_smooth(method = "lm", colour = "blue", se = FALSE, linewidth = 0.5) + # Add regression curve
  geom_text(data = sociolex_ratings_summary2a %>%
                filter(item_ID %in% c("image_19", "image_33")) %>%
                mutate(letter = ifelse(item_ID == "image_33", "A", "B")),
            aes(label = letter), colour = "red") +
  geom_text(data = correlation_comparison_values,
            aes(x = -3, y = 3, label = cor), hjust = 0, vjust = 1) +
  scale_x_continuous(limits = c(-3, 3), breaks = seq(-3, 3, 1)) +
  scale_y_continuous(limits = c(-3, 3), breaks = seq(-3, 3, 1)) +
  facet_grid(dimension~comparison) +
  theme_bw()

correlation_comparison_plot
## `geom_smooth()` using formula = 'y ~ x'

ggsave(plot = correlation_comparison_plot, filename = "plots/correlation_comparisons.png", width = 7, height = 9, dpi = 400)
## `geom_smooth()` using formula = 'y ~ x'
# Re-attach the item coding metadata, dropping the old version2 column first
# so the join supplies a fresh one; multi-word items are excluded.
# NOTE(review): missing version2 is filled with "Word" here, but the
# equivalent fill earlier in this script uses "Words" -- confirm which
# spelling downstream code expects.
sociolex_ratings_summary <- sociolex_ratings_summary %>%
  select(-version2) %>%
  left_join(sociolex_coding %>%
              mutate(version2 = ifelse(is.na(version2), "Word", version2)) %>%
              filter(multiple_word != 1))
## Joining with `by = join_by(item, version)`

Save the data

# Write the main item-level summary to CSV.
write_delim(sociolex_ratings_summary,
            "sociolex_data/summary/sociolex_data_summary.csv",
            delim = ",")

# Write the word-level age summary (modal proportions renamed for clarity,
# English/Czech glosses joined on, sanity-check `sum` column dropped).
write_delim(sociolex_ratings_proportion_age_pca %>%
              rename(prop_mode = prop,
                     n_mode = n) %>%
              left_join(words_coding) %>%
              select(-sum),
            "sociolex_data/summary/sociolex_data_summary_age_words.csv",
            delim = ",")
## Joining with `by = join_by(item)`
# Write the full per-rating-point proportion table.
write_delim(sociolex_ratings_summary_proportions,
            "sociolex_data/summary/sociolex_data_summary_proportions.csv",
            delim = ",")